import json
import os
import re
from sqlite3 import connect

import requests
from lxml import etree
from zhconv import convert


class GetJuZi():
    """Scrape the numbered sentences from a single yjbys.com page.

    Constructing an instance performs the whole run: the page at ``self.url``
    is downloaded and parsed, and every selected sentence is converted to
    Simplified Chinese, printed, and appended to ``text/<page title>.txt``.
    """

    # Seconds to wait for the server before giving up, so that a stalled
    # connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Download the hard-coded page and save its sentences.

        Raises:
            requests.RequestException: if the page cannot be fetched.
            IndexError: if the page has no title at the expected location.
        """
        self.url = 'https://www.yjbys.com/yulu/weimeiyulu/105249.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        res = self.common_get(self.url)
        self.xpth_handle(res)

    def common_get(self, url):
        """Fetch *url* and return it parsed as an lxml HTML tree.

        Args:
            url: Address of the page to download.

        Returns:
            The root element produced by ``etree.HTML`` for the page text.

        Raises:
            requests.RequestException: on connection failure, timeout, or an
                HTTP error status.
        """
        res = requests.get(url, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        # Fail here with a clear HTTP error instead of later while parsing
        # an error page.
        res.raise_for_status()
        # Decode the body as GB2312 rather than whatever requests guessed
        # (presumably the encoding the site serves -- confirm if text is garbled).
        res.encoding = "gb2312"
        return etree.HTML(res.text)

    def hant_2_hans(self, hant_str):
        """Convert *hant_str* from Traditional to Simplified Chinese."""
        return convert(hant_str, 'zh-hans')

    @staticmethod
    def _strip_numbering(text):
        """Return *text* without its leading ``<digits>、`` numbering prefix.

        Only the prefix is removed; digits and enumeration commas inside the
        sentence itself are preserved.
        """
        return re.sub(r'^\d+\s*、\s*', '', text)

    def xpth_handle(self, res):
        """Extract the sentences from the parsed page and append them to a file.

        Paragraphs under ``div.content`` are scanned; those whose first text
        node contains ``、`` are renumbered sequentially, converted to
        Simplified Chinese, printed, and appended to ``text/<title>.txt``.

        Args:
            res: Parsed HTML tree as returned by :meth:`common_get`.

        Raises:
            IndexError: if the title ``<h1>`` is not found at the expected
                absolute path (for example after a page layout change).
        """
        items = res.xpath("//div[@class='content']/p")
        title = res.xpath('/html/body/div[2]/div[1]/div/h1/text()')[0]
        print(len(items))
        # The first three paragraphs are skipped (presumably introductory
        # text rather than sentences -- verify against the page).
        items = items[3:]

        # Make sure the output directory exists so a first run does not fail.
        os.makedirs('text', exist_ok=True)
        # Append mode: running the script again adds to the same file.
        with open(os.path.join('text', title + '.txt'), 'a',
                  encoding='utf-8') as file:
            idx = 0
            for item in items:
                text = item.xpath('./text()')
                if not text:
                    continue
                text = text[0].strip()
                # Only paragraphs containing the enumeration comma are kept.
                if '、' not in text:
                    continue
                idx += 1
                # Convert once and reuse for both the console and the file.
                sentence = self.hant_2_hans(self._strip_numbering(text))
                print(sentence)
                file.write('{}、{}\n'.format(idx, sentence))

        # with open('句子.json', 'w') as file_obj:
        #     content = []
        #     for item in items:
        #         text = item.xpath('./text()')
        #         if len(text) > 0:
        #             text = text[0].strip()
        #             text = re.sub('[\d+、]', '', text)
        #             content.append(text)

        #     json.dump(content, file_obj)


if __name__ == '__main__':
    # Building the scraper runs the whole job: the constructor downloads the
    # page and writes the extracted sentences to disk.
    scraper = GetJuZi()
    # Completion message, printed once the constructor has returned.
    print('请求完毕')
